library(phyloseq); packageVersion("phyloseq")
## [1] '1.46.0'
library(ggplot2); packageVersion("ggplot2")
## [1] '3.5.1'
library(readr); packageVersion("readr")
## [1] '2.1.5'
library(tidyr); packageVersion("tidyr")
## [1] '1.3.1'
library(purrr); packageVersion("purrr")
## [1] '1.0.2'
library(furrr); packageVersion("furrr")
## Loading required package: future
## Warning: package 'future' was built under R version 4.3.3
## [1] '0.3.1'
library(dplyr); packageVersion("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## [1] '1.1.4'
library(stringr); packageVersion("stringr")
## [1] '1.5.1'
library(forcats); packageVersion("forcats")
## [1] '1.0.0'
library(metacoder); packageVersion("metacoder")
## This is metacoder version 0.3.7 (stable)
## 
## Attaching package: 'metacoder'
## The following object is masked from 'package:ggplot2':
## 
##     map_data
## The following object is masked from 'package:phyloseq':
## 
##     filter_taxa
## [1] '0.3.7'
library(data.table); packageVersion("data.table")
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
## [1] '1.15.4'
library(decontam); packageVersion("decontam")
## [1] '1.22.0'
library(Biostrings); packageVersion("Biostrings")
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:data.table':
## 
##     first, second
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:data.table':
## 
##     shift
## The following object is masked from 'package:metacoder':
## 
##     reverse
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:purrr':
## 
##     reduce
## The following object is masked from 'package:phyloseq':
## 
##     distance
## Loading required package: XVector
## 
## Attaching package: 'XVector'
## The following object is masked from 'package:purrr':
## 
##     compact
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 4.3.3
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:metacoder':
## 
##     complement
## The following object is masked from 'package:base':
## 
##     strsplit
## [1] '2.70.3'
library(magick); packageVersion("magick")
## Warning: package 'magick' was built under R version 4.3.3
## Linking to ImageMagick 6.9.12.93
## Enabled features: cairo, fontconfig, freetype, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fftw, ghostscript, x11
## [1] '2.8.4'
library(vegan); packageVersion("vegan")
## Warning: package 'vegan' was built under R version 4.3.3
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-6.1
## [1] '2.6.6.1'
library(pdftools);packageVersion("pdftools")
## Warning: package 'pdftools' was built under R version 4.3.3
## Using poppler version 23.04.0
## [1] '3.4.1'
library(vegan); packageVersion("vegan")
## [1] '2.6.6.1'
library(grid)
## 
## Attaching package: 'grid'
## The following object is masked from 'package:Biostrings':
## 
##     pattern
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:BiocGenerics':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
knitr::opts_knit$set(root.dir = "~/benchmark_demulticoder")

Load outputs from the demulticoder workflow

seed <- 1
set.seed(seed)

asv_matrix_rps10<-read.csv("demulticoder/data/final_asv_abundance_matrix_rps10.csv")
asv_matrix_rps10$dada2_tax <- asv_matrix_rps10$dada2_tax <- gsub("Stramenopila", "Eukaryota--100--Domain;Stramenopila", asv_matrix_rps10$dada2_tax)
asv_matrix_rps10 <- asv_matrix_rps10[, -1]
colnames(asv_matrix_rps10)[3:ncol(asv_matrix_rps10)] <- gsub("_rps10$", "", colnames(asv_matrix_rps10)[3:ncol(asv_matrix_rps10)])

asv_matrix_its<-read.csv("demulticoder/data/final_asv_abundance_matrix_its.csv")
asv_matrix_its$dada2_tax <- gsub("Fungi", "Eukaryota--100--Domain;Fungi", asv_matrix_its$dada2_tax)
asv_matrix_its <- asv_matrix_its[, -1]
colnames(asv_matrix_its)[3:ncol(asv_matrix_rps10)] <- gsub("_its$", "", colnames(asv_matrix_its)[3:ncol(asv_matrix_its)])


#Let's combine these matrices
#For easier analysis, we previously combined the two matrices, and appended domain info to each one so we can make one heat tree for combined dataset
sample_cols_its <- setdiff(names(asv_matrix_its), c("sequence", "dada2_tax"))
sample_cols_rps10 <- setdiff(names(asv_matrix_rps10), c("sequence", "dada2_tax"))

if(!all(sample_cols_its == sample_cols_rps10)) {
  stop("Sample columns do not match between ITS and RPS10 dataframes!")
}

abundance <- rbind(
 asv_matrix_its[, c("sequence", "dada2_tax", sample_cols_its)],  # ITS data
  asv_matrix_rps10[, c("sequence", "dada2_tax", sample_cols_rps10)]  # RPS10 data
)

write.csv(abundance, "demulticoder/data/final_asv_abundance_matrix_combined_demulticoder.csv")

metadata_path <- file.path("demulticoder/data/metadata.csv")
metadata <- read_csv(metadata_path)
## Rows: 174 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): sample_name, well, organism, sample_type
## dbl (3): plate, path_conc, experiment
## lgl (2): flooded, is_ambiguous
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
print(metadata)
## # A tibble: 174 × 9
##    sample_name plate well  organism flooded path_conc experiment sample_type
##    <chr>       <dbl> <chr> <chr>    <lgl>       <dbl>      <dbl> <chr>      
##  1 S1              1 A01   Cry      TRUE          100          1 Sample     
##  2 S10             1 B02   Cry      TRUE            1          1 Sample     
##  3 S100            2 B02   Cin      TRUE            1          2 Sample     
##  4 S101            2 C02   Plu      FALSE           1          2 Sample     
##  5 S102            2 D02   Cry      TRUE            1          2 Sample     
##  6 S103            2 F02   Cin      FALSE           1          2 Sample     
##  7 S104            2 G02   Plu      FALSE           1          2 Sample     
##  8 S105            2 H02   Cry      TRUE          100          2 Sample     
##  9 S106            2 A03   Plu      FALSE         100          2 Sample     
## 10 S107            2 B03   Cin      TRUE          100          2 Sample     
## # ℹ 164 more rows
## # ℹ 1 more variable: is_ambiguous <lgl>
#Fourth, let's track reads
track_reads_demulticoder_its<-read.csv("demulticoder/data/track_reads_its.csv", row.names = 1)
track_reads_demulticoder_rps10<-read.csv("demulticoder/data/track_reads_rps10.csv", row.names = 1)

Let’s collect some summary stats for each dataset

its_summary_reads<-summary(track_reads_demulticoder_its)
print(its_summary_reads)
##      input           filtered       denoisedF       denoisedR    
##  Min.   :    82   Min.   :    1   Min.   :    1   Min.   :    1  
##  1st Qu.: 23440   1st Qu.:12671   1st Qu.:12375   1st Qu.:12308  
##  Median : 37507   Median :23008   Median :22684   Median :22513  
##  Mean   : 42989   Mean   :25466   Mean   :25212   Mean   :25078  
##  3rd Qu.: 57640   3rd Qu.:31912   3rd Qu.:31614   3rd Qu.:31508  
##  Max.   :136509   Max.   :87421   Max.   :87247   Max.   :87170  
##      merged         nonchim     
##  Min.   :    0   Min.   :    0  
##  1st Qu.:11794   1st Qu.:11794  
##  Median :21770   Median :21770  
##  Mean   :24322   Mean   :24230  
##  3rd Qu.:30406   3rd Qu.:30050  
##  Max.   :86880   Max.   :86868
rps10_summary_reads<-summary(track_reads_demulticoder_rps10)
print(rps10_summary_reads)
##      input           filtered        denoisedF        denoisedR     
##  Min.   :    91   Min.   :    26   Min.   :     7   Min.   :     6  
##  1st Qu.: 37994   1st Qu.: 29199   1st Qu.: 29186   1st Qu.: 29184  
##  Median : 61070   Median : 48085   Median : 47950   Median : 48060  
##  Mean   : 73511   Mean   : 56500   Mean   : 56435   Mean   : 56433  
##  3rd Qu.: 95487   3rd Qu.: 74825   3rd Qu.: 74408   3rd Qu.: 74436  
##  Max.   :263101   Max.   :219382   Max.   :218997   Max.   :219291  
##      merged          nonchim      
##  Min.   :     0   Min.   :     0  
##  1st Qu.: 29172   1st Qu.: 29172  
##  Median : 47653   Median : 47653  
##  Mean   : 56030   Mean   : 55349  
##  3rd Qu.: 73485   3rd Qu.: 73400  
##  Max.   :218521   Max.   :217372

Let’s now retrieve stats but only on well supported taxa

# Function to extract taxonomic levels and count unique entries with bootstrap filtering
# Function to extract taxonomic levels and count unique entries
analyze_taxonomy <- function(data) {
  # Get taxonomy data
  tax_data <- data$dada2_tax
  
  # Split taxonomy string into components
  tax_splits <- strsplit(tax_data, ";")
  
  # Function to safely extract taxonomic level with proper rank checking
  extract_tax_level <- function(tax_array, rank) {
    # Find the position containing the rank
    rank_pos <- grep(rank, tax_array)
    if(length(rank_pos) > 0) {
      # Extract and clean the taxonomy name
      tax <- tax_array[rank_pos]
      # Remove the rank pattern and any leading/trailing whitespace
      cleaned_tax <- trimws(gsub(paste0("--\\d+--", rank), "", tax))
      return(cleaned_tax)
    }
    return(NA)
  }
  
  # Extract each taxonomic level with proper hierarchy checking
  families <- sapply(tax_splits, function(x) extract_tax_level(x, "Family"))
  genera <- sapply(tax_splits, function(x) extract_tax_level(x, "Genus"))
  species <- sapply(tax_splits, function(x) extract_tax_level(x, "Species"))
  
  # Create a data frame to check hierarchy consistency
  tax_df <- data.frame(
    Family = families,
    Genus = genera,
    Species = species,
    stringsAsFactors = FALSE
  )
  
  # Remove NA values before counting
  families <- families[!is.na(families)]
  genera <- genera[!is.na(genera)]
  species <- species[!is.na(species)]
  
  # Count unique entries
  unique_counts <- list(
    families = length(unique(families)),
    genera = length(unique(genera)),
    species = length(unique(species))
  )
  
  # Get prevalence with hierarchy checking
  family_prev <- table(families)
  genus_prev <- table(genera)
  species_prev <- table(species)
  
  # Sort prevalence tables
  family_prev <- sort(family_prev, decreasing = TRUE)
  genus_prev <- sort(genus_prev, decreasing = TRUE)
  species_prev <- sort(species_prev, decreasing = TRUE)
  
  # Print top 5 of each level for verification
  cat("\nTop 5 most prevalent Families:\n")
  print(head(family_prev, 5))
  cat("\nTop 5 most prevalent Genera:\n")
  print(head(genus_prev, 5))
  cat("\nTop 5 most prevalent Species:\n")
  print(head(species_prev, 5))
  
  # Get most prevalent taxa with hierarchy verification
  most_prevalent <- list(
    family = if(length(family_prev) > 0) names(family_prev)[1] else "None found",
    genus = if(length(genus_prev) > 0) names(genus_prev)[1] else "None found",
    species = if(length(species_prev) > 0) names(species_prev)[1] else "None found"
  )
  
  return(list(
    unique_counts = unique_counts,
    most_prevalent = most_prevalent,
    tax_df = tax_df  # Return full taxonomy dataframe for inspection
  ))
}

# Run analysis for both datasets
if(exists("asv_matrix_rps10")) {
  rps10_results <- analyze_taxonomy(asv_matrix_rps10)
}
## 
## Top 5 most prevalent Families:
## families
##      Pythiaceae Peronosporaceae Saprolegniaceae Saplolegniaceae   Lagenidiaceae 
##             171             126              20              10               9 
## 
## Top 5 most prevalent Genera:
## genera
##      Pythium Phytophthora  Aphanomyces  Peronospora  Saprolegnia 
##          152           96           18           16           11 
## 
## Top 5 most prevalent Species:
## species
## Phytophthora_sp.kununurra Phytophthora_citrophthora          Pythium_lutarium 
##                        23                        19                        19 
##        Pythium_dissotocum        Phytophthora_sojae 
##                        14                        13
if(exists("asv_matrix_its")) {
  its_results <- analyze_taxonomy(asv_matrix_its)
}
## 
## Top 5 most prevalent Families:
## families
##         Fungi_fam_Incertae_sedis                  Serendipitaceae 
##                              366                              200 
## Rozellomycota_fam_Incertae_sedis                  Hyaloscyphaceae 
##                              176                              157 
##                       Tuberaceae 
##                              128 
## 
## Top 5 most prevalent Genera:
## genera
##         Fungi_gen_Incertae_sedis                      Serendipita 
##                              366                              196 
## Rozellomycota_gen_Incertae_sedis                            Tuber 
##                              176                              128 
##                      Hyaloscypha 
##                               94 
## 
## Top 5 most prevalent Species:
## species
##             Fungi_sp       Serendipita_sp     Rozellomycota_sp 
##                  366                  190                  176 
##  Tuber_pseudobrumale Archaeorhizomyces_sp 
##                  128                   48

Let’s now use combined CSV files to make a few more plots-this shows how we can easily convert metadata and combined ASV matrix into a another taxmap object

First we will configure the combined taxmap object

#load reference database
rps10_database <- read_fasta("demulticoder/data/rps10_reference_db.fa")
innoc_species <- c(Cin = "Phytophthora_cinnamomi",  Plu = "Phytophthora_plurivora", Cry = "Pythium_cryptoirregulare")
innoc_regex <- paste0('(', paste0(innoc_species, collapse = '|'), ')')
innoc_seqs <- rps10_database[grepl(names(rps10_database), pattern = innoc_regex)]
innoc_seqs <- innoc_seqs[! duplicated(innoc_seqs)]
names(innoc_seqs) <- str_match(names(innoc_seqs), pattern = innoc_regex)[, 2]

iupac_match <- function(asv_chars, ref_chars) {
  map2_lgl(asv_chars, ref_chars, function(asv, ref) grepl(asv, pattern = paste0('[', IUPAC_CODE_MAP[ref], ']+')))
}

# Count number of mismatches in an alignment, allowing for IUPAC codes in reference
align_mismatch <- function(alignment) {
  asv_chars <- strsplit(as.character(alignment@pattern), '')[[1]]
  ref_chars <- strsplit(as.character(alignment@subject), '')[[1]]
  sum(! iupac_match(asv_chars, ref_chars))
}

# Align each sequence to each asv and make table of results"
aligned_data_path <- file.path("demulticoder/data", "infection_aligned_data.rds")

if (file.exists(aligned_data_path)) {
  aligned_data <- readRDS(aligned_data_path)
} else {
  aligned_data <-  future_map_dfr(seq_along(innoc_seqs), function(i) {
    aligned <- lapply(abundance$sequence, function(asv) pairwiseAlignment(pattern = asv, subject = innoc_seqs[i], type = 'global-local'))
    print(paste("Processing species:", names(innoc_seqs)[i]))
    tibble(
      species = names(innoc_seqs)[i],
      align_len = map_dbl(aligned, nchar),
      mismatch = map_dbl(aligned, align_mismatch),
      pid = (align_len - mismatch) / align_len,
      asv_seq = abundance$sequence, 
      ref_seq = innoc_seqs[i],
      alignment = aligned
    )
  })
  saveRDS(aligned_data, file = aligned_data_path)
}

# Convert ASV abundances to proportions for use with matches
abundance_prop <- abundance
abundance_prop[metadata$sample_name] <- lapply(abundance_prop[metadata$sample_name], function(counts) counts/sum(counts))

innoculum_asv_mismatch_threshold <-1
# Sum the read counts of ASVs representing the same species used for inoculation
infection_data <- aligned_data %>%
  filter(mismatch <= innoculum_asv_mismatch_threshold) %>%
  left_join(abundance_prop, by = c(asv_seq = "sequence")) %>%
  group_by(asv_seq) %>% slice_min(mismatch, with_ties = FALSE) # Only consider the best match per ASV

Prepare trimmed tables for subsequent analysis

## Update abundance matrix and metadata with results
inoculum_asv_key <- setNames(infection_data$species, infection_data$asv_seq)
abundance <- mutate(abundance, is_inoculum = sequence %in% infection_data$asv_seq, inoculum = inoculum_asv_key[sequence], .after = "dada2_tax")
write_csv(abundance, file.path('demulticoder/data', 'abundance_with_infection_data.csv'))

#Let's append metadata table with more info
species_name_key <- c(Phytophthora_cinnamomi = "cin_prop", Phytophthora_plurivora = "plu_prop", Pythium_cryptoirregulare = "cry_prop")
sample_infection_data <- infection_data %>%
  group_by(species) %>% summarise_at(metadata$sample_name, sum) %>% # Sum per inoculum species
  gather(key = "sample_name", value = "read_prop", !!! metadata$sample_name) %>%
  mutate(species = species_name_key[species]) %>%
  mutate(read_prop = ifelse(is.nan(read_prop), 0, read_prop)) %>%
  tidyr::spread(key = 'species', value = 'read_prop')
metadata <- metadata %>%
  select(sample_name, plate, well, organism, flooded, path_conc, experiment, sample_type, is_ambiguous) %>% # Avoid duplicating columns from previous runs
  left_join(sample_infection_data, by = "sample_name")

# Get expected inoculum proportion and non-target proportion
metadata$expected_innoc_prop <- map_dbl(1:nrow(metadata), function(i) {
  if (metadata$sample_type[i] == "Mock community") {
    return(NA)
  } else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
    return(NA)
  } else {
    prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
    return(metadata[[i, prop_col]])
  }
})
metadata$unexpected_innoc_prop <- map_dbl(1:nrow(metadata), function(i) {
  prop_cols <- c("cin_prop", "cry_prop", "plu_prop")
  if (metadata$sample_type[i] == "Mock community"  || metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
    return(sum(metadata[i, prop_cols]))
  } else {
    expected_prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
    prop_cols <- prop_cols[! prop_cols %in% expected_prop_col]
    return(sum(metadata[i, prop_cols]))
  }
})

# Check if expected inoculum was found
metadata$expected_innoc <- map_lgl(1:nrow(metadata), function(i) {
  all_innoc <- sum(metadata[i, c('cin_prop', 'cry_prop', 'plu_prop')])
  if (metadata$sample_type[i] == "Mock community") {
    return(NA)
  } else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
    return(all_innoc == 0)
  } else {
    prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
    return(metadata[[i, prop_col]] > 0)
  }
})

# Check if ONLY expected inoculum was found
metadata$only_expected_innoc <- map_lgl(1:nrow(metadata), function(i) {
  all_innoc <- sum(metadata[i, c('cin_prop', 'cry_prop', 'plu_prop')])
  if (metadata$sample_type[i] == "Mock community") {
    return(NA)
  } else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
    return(all_innoc == 0)
  } else {
    prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
    return(metadata[[i, prop_col]] > 0 && metadata[[i, prop_col]] == all_innoc)
  }
})

# Look at number of samples with expected inoculum
table(metadata$expected_innoc)
## 
## FALSE  TRUE 
##    88    82
table(metadata$only_expected_innoc)
## 
## FALSE  TRUE 
##   107    63
metadata$valid_inoc <- (is.na(metadata$expected_innoc_prop) | metadata$expected_innoc_prop > 0.0001) & metadata$unexpected_innoc_prop < 0.0001
map_dbl(split(metadata$valid_inoc, metadata$organism), sum)
##     Cin Control     Cry     Plu 
##      26       9      16       8
# Save new metadata
write_csv(metadata, file.path('demulticoder/data', 'metadata_with_infection_data.csv'))

For the sample data, I will add columns for the proportion of reads in each sample representing each inoculum as well as a columns that indicate whether the expected inoculum was found, and if so, if it was the only inoculum found.

# Get per-sample proportions of reads
species_name_key <- c(Phytophthora_cinnamomi = "cin_prop", Phytophthora_plurivora = "plu_prop", Pythium_cryptoirregulare = "cry_prop")
sample_infection_data <- infection_data %>%
  group_by(species) %>% summarise_at(metadata$sample_name, sum) %>% # Sum per inoculum species
  tidyr::gather(key = "sample_name", value = "read_prop", !!! metadata$sample_name) %>%
  mutate(species = species_name_key[species]) %>%
  mutate(read_prop = ifelse(is.nan(read_prop), 0, read_prop)) %>%
  tidyr::spread(key = 'species', value = 'read_prop')
metadata <- metadata %>%
  select(sample_name, plate, well, organism, flooded, path_conc, experiment, sample_type, is_ambiguous) %>% # Avoid duplicating columns from previous runs
  left_join(sample_infection_data, by = "sample_name")

# Get expected inoculum proportion and non-target proportion
metadata$expected_innoc_prop <- map_dbl(1:nrow(metadata), function(i) {
  if (metadata$sample_type[i] == "Mock community") {
    return(NA)
  } else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
    return(NA)
  } else {
    prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
    return(metadata[[i, prop_col]])
  }
})
metadata$unexpected_innoc_prop <- map_dbl(1:nrow(metadata), function(i) {
  prop_cols <- c("cin_prop", "cry_prop", "plu_prop")
  if (metadata$sample_type[i] == "Mock community"  || metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
    return(sum(metadata[i, prop_cols]))
  } else {
    expected_prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
    prop_cols <- prop_cols[! prop_cols %in% expected_prop_col]
    return(sum(metadata[i, prop_cols]))
  }
})

# Check if expected inoculum was found
metadata$expected_innoc <- map_lgl(1:nrow(metadata), function(i) {
  all_innoc <- sum(metadata[i, c('cin_prop', 'cry_prop', 'plu_prop')])
  if (metadata$sample_type[i] == "Mock community") {
    return(NA)
  } else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
    return(all_innoc == 0)
  } else {
    prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
    return(metadata[[i, prop_col]] > 0)
  }
})

# Check if ONLY expected inoculum was found
metadata$only_expected_innoc <- map_lgl(1:nrow(metadata), function(i) {
  all_innoc <- sum(metadata[i, c('cin_prop', 'cry_prop', 'plu_prop')])
  if (metadata$sample_type[i] == "Mock community") {
    return(NA)
  } else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
    return(all_innoc == 0)
  } else {
    prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
    return(metadata[[i, prop_col]] > 0 && metadata[[i, prop_col]] == all_innoc)
  }
})

# Look at number of samples with expected inoculum
table(metadata$expected_innoc)
## 
## FALSE  TRUE 
##    88    82
table(metadata$only_expected_innoc)
## 
## FALSE  TRUE 
##   107    63
metadata$valid_inoc <- (is.na(metadata$expected_innoc_prop) | metadata$expected_innoc_prop > 0.0001) & metadata$unexpected_innoc_prop < 0.0001
map_dbl(split(metadata$valid_inoc, metadata$organism), sum)
##     Cin Control     Cry     Plu 
##      26       9      16       8
# Save new metadata
write_csv(metadata, file.path('demulticoder/data', 'metadata_with_infection_data.csv'))

Let’s convert our new matrix and associated metadata table to a taxmap object

metadata <- filter(metadata, ! is_ambiguous)
abundance <- filter(abundance, ! is_inoculum)

#Filter out mock community samples and also any mock community samples as well
metadata <- filter(metadata, sample_type != "Mock community" & sample_type != "Negative control")

obj <- parse_tax_data(abundance, class_cols = 'dada2_tax', class_sep = ';',
                      class_regex = '^(.+)--(.+)--(.+)$',
                      class_key = c(taxon = 'taxon_name', boot = 'info', rank = 'taxon_rank'))
names(obj$data) <- c('abund', 'score')
obj <- transmute_obs(obj, 'score', sequence = sequence[input_index], boot = boot, rank = rank)


# For diversity calculations, we'll use proportions of read depth
# We'll set any proportion to 0 that is less than the inverse of the read count of the non-control sample with the fewest reads.
# This should account for unequal sample read depth without the randomness of rarefaction.

metadata$raw_count <- colSums(obj$data$abund[, metadata$sample_name])
lowest_count <- min(metadata$raw_count[! is.na(metadata$organism)])
lowest_count
## [1] 12676
obj$data$prop <- calc_obs_props(obj, data = 'abund', cols = metadata$sample_name)
## Calculating proportions from counts for 162 columns for 2707 observations.
obj$data$prop <- zero_low_counts(obj, data = 'prop', min_count = 1 / lowest_count, cols = metadata$sample_name)
## Zeroing 2166 of 438534 counts less than 7.88892395077311e-05.
obj$data$prop
## # A tibble: 2,707 × 163
##    taxon_id       S1      S10     S100     S101     S102    S103    S104    S105
##    <chr>       <dbl>    <dbl>    <dbl>    <dbl>    <dbl>   <dbl>   <dbl>   <dbl>
##  1 bga      0.0821   0.101    0.00142  0.0396   0.0345   1.90e-3 8.63e-4 1.68e-2
##  2 bgb      0.00867  0.0190   0.0154   0.00757  0.00565  2.95e-2 3.11e-2 8.06e-3
##  3 bgc      0.134    0        0        0.0316   0        0       0       0      
##  4 bgd      0.00482  0        0        0.0694   0.000199 4.26e-4 1.96e-2 2.74e-3
##  5 bge      0.00584  0.00589  0.00680  0.0121   0.00495  7.28e-3 2.41e-2 4.03e-3
##  6 bgf      0.000106 0.0137   0.00337  0.0190   0.00635  1.62e-2 5.25e-3 8.30e-3
##  7 bgg      0.000158 0.000228 0.000778 0.000188 0.000350 4.55e-4 1.23e-3 4.25e-4
##  8 bgh      0.00322  0.0271   0.00405  0.00110  0.00304  2.50e-3 6.02e-3 1.68e-3
##  9 bgc      0        0        0        0        0        0       0       0      
## 10 bgc      0        0        0.00461  0        0        0       0       0      
## # ℹ 2,697 more rows
## # ℹ 154 more variables: S106 <dbl>, S107 <dbl>, S108 <dbl>, S109 <dbl>,
## #   S11 <dbl>, S110 <dbl>, S111 <dbl>, S112 <dbl>, S113 <dbl>, S114 <dbl>,
## #   S115 <dbl>, S116 <dbl>, S117 <dbl>, S118 <dbl>, S119 <dbl>, S12 <dbl>,
## #   S120 <dbl>, S121 <dbl>, S122 <dbl>, S123 <dbl>, S124 <dbl>, S125 <dbl>,
## #   S126 <dbl>, S127 <dbl>, S128 <dbl>, S129 <dbl>, S13 <dbl>, S130 <dbl>,
## #   S131 <dbl>, S132 <dbl>, S133 <dbl>, S134 <dbl>, S135 <dbl>, S136 <dbl>, …
obj$data$prop[metadata$sample_name] <- map(metadata$sample_name, function(id) {
  out <- obj$data$prop[[id]]
  out[is.na(out) | is.nan(out)] <- 0
  out
})

obj$data$prop
## # A tibble: 2,707 × 163
##    taxon_id       S1      S10     S100     S101     S102    S103    S104    S105
##    <chr>       <dbl>    <dbl>    <dbl>    <dbl>    <dbl>   <dbl>   <dbl>   <dbl>
##  1 bga      0.0821   0.101    0.00142  0.0396   0.0345   1.90e-3 8.63e-4 1.68e-2
##  2 bgb      0.00867  0.0190   0.0154   0.00757  0.00565  2.95e-2 3.11e-2 8.06e-3
##  3 bgc      0.134    0        0        0.0316   0        0       0       0      
##  4 bgd      0.00482  0        0        0.0694   0.000199 4.26e-4 1.96e-2 2.74e-3
##  5 bge      0.00584  0.00589  0.00680  0.0121   0.00495  7.28e-3 2.41e-2 4.03e-3
##  6 bgf      0.000106 0.0137   0.00337  0.0190   0.00635  1.62e-2 5.25e-3 8.30e-3
##  7 bgg      0.000158 0.000228 0.000778 0.000188 0.000350 4.55e-4 1.23e-3 4.25e-4
##  8 bgh      0.00322  0.0271   0.00405  0.00110  0.00304  2.50e-3 6.02e-3 1.68e-3
##  9 bgc      0        0        0        0        0        0       0       0      
## 10 bgc      0        0        0.00461  0        0        0       0       0      
## # ℹ 2,697 more rows
## # ℹ 154 more variables: S106 <dbl>, S107 <dbl>, S108 <dbl>, S109 <dbl>,
## #   S11 <dbl>, S110 <dbl>, S111 <dbl>, S112 <dbl>, S113 <dbl>, S114 <dbl>,
## #   S115 <dbl>, S116 <dbl>, S117 <dbl>, S118 <dbl>, S119 <dbl>, S12 <dbl>,
## #   S120 <dbl>, S121 <dbl>, S122 <dbl>, S123 <dbl>, S124 <dbl>, S125 <dbl>,
## #   S126 <dbl>, S127 <dbl>, S128 <dbl>, S129 <dbl>, S13 <dbl>, S130 <dbl>,
## #   S131 <dbl>, S132 <dbl>, S133 <dbl>, S134 <dbl>, S135 <dbl>, S136 <dbl>, …
#Let's modify the metadata sheet
metadata <- metadata %>%
  mutate(organism = fct_relevel(ordered(organism), "Control", "Cin", "Plu", "Cry"))

Alpha diversity

abund_table <- obj$data$abund[metadata$sample_name]
metadata$richness <- vegan::specnumber(abund_table, MARGIN = 2)
metadata$shannon <- vegan::diversity(abund_table, MARGIN = 2, index = "shannon")
metadata$invsimpson <- vegan::diversity(abund_table, MARGIN = 2, index = "invsimpson")
write.csv(metadata, file.path("demulticoder", "results/alpha_diversity.csv"))
print(metadata)
## # A tibble: 162 × 21
##    sample_name plate well  organism flooded path_conc experiment sample_type
##    <chr>       <dbl> <chr> <ord>    <lgl>       <dbl>      <dbl> <chr>      
##  1 S1              1 A01   Cry      TRUE          100          1 Sample     
##  2 S10             1 B02   Cry      TRUE            1          1 Sample     
##  3 S100            2 B02   Cin      TRUE            1          2 Sample     
##  4 S101            2 C02   Plu      FALSE           1          2 Sample     
##  5 S102            2 D02   Cry      TRUE            1          2 Sample     
##  6 S103            2 F02   Cin      FALSE           1          2 Sample     
##  7 S104            2 G02   Plu      FALSE           1          2 Sample     
##  8 S105            2 H02   Cry      TRUE          100          2 Sample     
##  9 S106            2 A03   Plu      FALSE         100          2 Sample     
## 10 S107            2 B03   Cin      TRUE          100          2 Sample     
## # ℹ 152 more rows
## # ℹ 13 more variables: is_ambiguous <lgl>, cin_prop <dbl>, cry_prop <dbl>,
## #   plu_prop <dbl>, expected_innoc_prop <dbl>, unexpected_innoc_prop <dbl>,
## #   expected_innoc <lgl>, only_expected_innoc <lgl>, valid_inoc <lgl>,
## #   raw_count <dbl>, richness <int>, shannon <dbl>, invsimpson <dbl>
plotted_factors <- c('Organism' = 'organism', 'Flooded' = 'flooded', 'Pathogen Concentration' = 'path_conc', 'Trial' = 'experiment')

# Reformat data for plotting
alpha_plot_data <- plotted_factors %>%
  map2_dfr(names(plotted_factors), 
           function(factor, factor_name) {
             out <- metadata
             out$factor <- factor_name
             out$value <- as.character(metadata[[factor]])
             return(out)
           }) %>%
  mutate(path_conc = factor(path_conc, 
                            levels = sort(unique(path_conc)), 
                            labels = paste(sort(unique(path_conc)), 'CFU/g'), 
                            ordered = TRUE)) %>%
  filter(sample_type == 'Sample') %>%
  select(sample_name, factor, value, invsimpson) %>%
  tidyr::gather(key = "index", value = "diversity", -sample_name, -factor, -value) %>%
  mutate(value = forcats::fct_relevel(ordered(value), "Control", "Cin", "Plu", "Cry"))

# ANOVA and Tukey's HSD
anova_and_hsd <- function(x) {
  anova_result <- aov(diversity ~ value, x)
  tukey_result <- agricolae::HSD.test(anova_result, "value", group = TRUE)
  group_data <- tukey_result$groups[order(rownames(tukey_result$groups)),]
  group_key <- setNames(group_data$groups, rownames(group_data))
  group_key[as.character(x$value)]
}
alpha_plot_data$group <- unlist(map(split(alpha_plot_data, alpha_plot_data$factor)[unique(alpha_plot_data$factor)], anova_and_hsd))


alpha_subplot <- ggplot(alpha_plot_data, aes(x = value, y = diversity)) +
  geom_boxplot() +
  geom_text(aes(x = value,
                y = max(diversity) + 2,
                label = group),
            col = 'black',
            size = 5) +
  facet_grid( ~ factor, scales = "free") +
  labs(x = NULL, y = 'Diversity (Inverse Simpson)') +
  guides(color = "none") +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position="bottom")

Beta diversity

set.seed(1)
prob_table <- obj$data$prop[metadata$sample_name]
nmds_plot_data <- function(prob_table) {
  metadata <- metadata[metadata$sample_name %in% colnames(prob_table), ]
  set.seed(1)
  nmds_results <- vegan::metaMDS(t(prob_table), trymax = 1000, k = 2, trace = 0)
  nmds_data <- nmds_results$points %>%
    as_tibble() %>%
    bind_cols(metadata)
  names(nmds_data)[1:2] <- paste0("NMDS", 1:2)
  return(nmds_data)
}

nmds_data <- nmds_plot_data(prob_table[! is.na(metadata$organism) & metadata$valid_inoc])

nmds_factors <- c(Flooded = 'flooded', Organism = 'organism', 'Pathogen CFU/g' = 'path_conc', 'Trial' = 'experiment')

make_one_plot <- function(factor, name) {
  nmds_data %>%
    mutate(factor = as.character(nmds_data[[factor]]),
           NMDS1 = scales::rescale(NMDS1),
           NMDS2 = scales::rescale(NMDS2)) %>%
    mutate(factor = fct_relevel(ordered(factor), "Control", "Cin", "Plu", "Cry")) %>%
    ggplot(aes_string(x = "NMDS1", y = "NMDS2", color = "factor", label = "sample_name")) +
    # geom_label(size = 2) +
    geom_point() +
    coord_fixed() +
    viridis::scale_color_viridis(discrete = TRUE, end = .9) +
    labs(color = NULL, x = NULL, y = NULL) +
    theme(panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          axis.text = element_blank(),
          axis.title = element_text(size = 7), 
          axis.ticks = element_blank(),
          plot.margin = unit(rep(0.04, 4), "cm"),
          # panel.background = element_rect(fill = 'transparent', colour = NA),
          # plot.background = element_rect(fill = "white", colour = NA),
          legend.position = "bottom",
          legend.text = element_text(size = 10),
          legend.key.height = unit(0.5, 'cm'),
          legend.key.width = unit(0.2, 'cm'))
}

nmds_subplots <- map2(nmds_factors, names(nmds_factors), make_one_plot)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `factor = fct_relevel(ordered(factor), "Control", "Cin", "Plu",
##   "Cry")`.
## Caused by warning:
## ! 4 unknown levels in `f`: Control, Cin, Plu, and Cry
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `factor = fct_relevel(ordered(factor), "Control", "Cin", "Plu",
##   "Cry")`.
## Caused by warning:
## ! 4 unknown levels in `f`: Control, Cin, Plu, and Cry
## There was 1 warning in `mutate()`.
## ℹ In argument: `factor = fct_relevel(ordered(factor), "Control", "Cin", "Plu",
##   "Cry")`.
## Caused by warning:
## ! 4 unknown levels in `f`: Control, Cin, Plu, and Cry
nmds_plot <- ggpubr::ggarrange(plotlist = c(list(ggplot() + theme_void()), nmds_subplots),
                       nrow = 1, widths = c(0.15, 1, 1, 1, 1))

ggsave(nmds_plot, path = "demulticoder/figures", filename = "nmds.pdf", 
       width = 7, height = 8)
print(nmds_plot)

combined_div_plot <- ggpubr::ggarrange(alpha_subplot, nmds_plot, ncol = 1, labels = c('A', 'B'),
                               heights = c(1, 1))
combined_div_plot

ggsave(combined_div_plot, path = "demulticoder/figures", filename = "diversity.pdf", 
       width = 10, height = 6, bg = "#FFFFFF")

ggsave(combined_div_plot, path = "demulticoder/figures", filename = "diversity.svg", 
       width = 10, height = 6, bg = "#FFFFFF")

Let’s do associated PERMANOVA analyses to better understand significance of differences in diversity between the 3 factors

dist_matrix <- vegdist(t(prob_table), method = "bray")

adonis2_full <- adonis2(dist_matrix ~ organism + flooded + path_conc + experiment, 
                         data = metadata, 
                         permutations = 999,
                         method = "bray")

print(adonis2_full)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dist_matrix ~ organism + flooded + path_conc + experiment, data = metadata, permutations = 999, method = "bray")
##             Df SumOfSqs      R2      F Pr(>F)    
## organism     3    1.254 0.02384 1.3900  0.058 .  
## flooded      1    1.848 0.03514 6.1469  0.001 ***
## path_conc    1    0.276 0.00524 0.9165  0.517    
## experiment   1    2.614 0.04971 8.6953  0.001 ***
## Residual   155   46.598 0.88608                  
## Total      161   52.590 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2_org <- adonis2(dist_matrix ~ organism, 
                         data = metadata, 
                         permutations = 999,
                         method = "bray")

print(adonis2_org)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dist_matrix ~ organism, data = metadata, permutations = 999, method = "bray")
##           Df SumOfSqs      R2      F Pr(>F)  
## organism   3    1.254 0.02384 1.2861  0.095 .
## Residual 158   51.336 0.97616                
## Total    161   52.590 1.00000                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2_flooded <- adonis2(dist_matrix ~ flooded, 
                         data = metadata, 
                         permutations = 999,
                         method = "bray")

print(adonis2_flooded)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dist_matrix ~ flooded, data = metadata, permutations = 999, method = "bray")
##           Df SumOfSqs      R2      F Pr(>F)    
## flooded    1    1.849 0.03515 5.8294  0.001 ***
## Residual 160   50.741 0.96485                  
## Total    161   52.590 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2_path_conc <- adonis2(dist_matrix ~ path_conc, 
                         data = metadata, 
                         permutations = 999,
                         method = "bray")

print(adonis2_path_conc)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dist_matrix ~ path_conc, data = metadata, permutations = 999, method = "bray")
##            Df SumOfSqs      R2      F Pr(>F)
## path_conc   1    0.308 0.00586 0.9431   0.46
## Residual  160   52.281 0.99414              
## Total     161   52.590 1.00000
adonis2_experiment <- adonis2(dist_matrix ~ experiment, 
                         data = metadata, 
                         permutations = 999,
                         method = "bray")

print(adonis2_experiment)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dist_matrix ~ experiment, data = metadata, permutations = 999, method = "bray")
##             Df SumOfSqs      R2      F Pr(>F)    
## experiment   1    2.613 0.04969 8.3655  0.001 ***
## Residual   160   49.977 0.95031                  
## Total      161   52.590 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Let’s make new taxmap object in preparation to make heattree examining abundance and diversity of taxa in at least 5 samples

#Make sure our factors are input properly as factors
metadata$path_conc <- as.factor(metadata$path_conc)
metadata$path_conc<- factor(metadata$path_conc, levels = c("0", "1", "100"))

metadata$organism <- as.factor(metadata$organism)
metadata$organism<- factor(metadata$organism, levels = c("Control", "Cin", "Cry", "Plu"))

metadata$flooded <- as.factor(metadata$flooded)
metadata$flooded <- factor(metadata$flooded, levels = c("TRUE", "FALSE"))

metadata$experiment <- as.factor(metadata$experiment)
metadata$experiment<- factor(metadata$experiment, levels = c("1", "2"))

abundance$is_low_abund <- rowSums(abundance[, metadata$sample_name]) < 5
obj2 <- parse_tax_data(abundance, class_cols = 'dada2_tax', class_sep = ';',
                      class_regex = '^(.+)--(.+)--(.+)$',
                      class_key = c(taxon = 'taxon_name', boot = 'info', rank = 'taxon_rank'))
names(obj2$data) <- c('abund', 'score')
obj2 <- transmute_obs(obj2, 'score', sequence = sequence[input_index], boot = boot, rank = rank)
print(obj2)
## <Taxmap>
##   1573 taxa: aab. Eukaryota, aac. Fungi ... cin. Fungi_sp
##   1573 edges: NA->aab, aab->aac, aab->aad ... cil->cim, cim->cin
##   2 data sets:
##     abund:
##       # A tibble: 2,707 × 180
##         taxon_id sequence   dada2_tax is_inoculum inoculum    S1   S10
##         <chr>    <chr>      <chr>     <lgl>       <chr>    <int> <int>
##       1 bga      AAGTCGTAA… Eukaryot… FALSE       <NA>      4670  3565
##       2 bgb      AAGTCGTAA… Eukaryot… FALSE       <NA>       493   668
##       3 bgc      AAAAAGTCG… Eukaryot… FALSE       <NA>      7603     0
##       # ℹ 2,704 more rows
##       # ℹ 173 more variables: S100 <int>, S101 <int>, S102 <int>,
##       #   S103 <int>, S104 <int>, S105 <int>, S106 <int>, S107 <int>,
##       #   S108 <int>, S109 <int>, …
##     score:
##       # A tibble: 23,852 × 4
##         taxon_id sequence                                  boot  rank 
##         <chr>    <chr>                                     <chr> <chr>
##       1 aab      AAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGA… 100   Doma…
##       2 aac      AAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGA… 100   King…
##       3 aae      AAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGA… 73    Phyl…
##       # ℹ 23,849 more rows
##   0 functions:
obj2$data$tax_abund <- calc_taxon_abund(obj2, data = "abund", cols = metadata$sample_name)
## Summing per-taxon counts from 162 columns for 1573 taxa
obj2$data$abund_prop <- calc_obs_props(obj2, data = "abund", cols = metadata$sample_name, groups = rep("tax_prop", nrow(metadata)))
## Calculating proportions from counts for 162 columns in 1 groups for 2707 observations.
obj2$data$tax_prop <- calc_taxon_abund(obj2, data = "abund_prop", cols = "tax_prop")
## Summing per-taxon counts from 1 columns for 1573 taxa
obj2$data$abund_prop <- NULL

obj_subset <- obj2 %>%
  filter_taxa(taxon_ranks == "Species", supertaxa = TRUE, reassign_obs = c(tax_abund = FALSE)) 
min_bootstrap <- 0

obj_subset$data$score$boot <- as.numeric(obj_subset$data$score$boot)
max_boot <- obj_subset$data$score %>%
  group_by(taxon_id) %>%
  summarise(max = max(boot))
max_boot <- setNames(max_boot$max, max_boot$taxon_id)
obj_subset <- filter_taxa(obj_subset, max_boot[taxon_ids] >= min_bootstrap | taxon_ranks %in% c("ASV", "Reference"), reassign_obs = c(abund = TRUE, score = FALSE))

Let’s make differential abundance plots for our combined dataset-first let’s look at diff. abundance of taxa by inoculum concentration

## [1] 2.11993e-11 1.00000e+00
## [1] -9.832517 29.140832

Let’s make differential abundance plots for our combined dataset-first let’s look at diff. abundance of taxa by whether plots were flooded or not

## [1] 3.934637e-21 9.957609e-01
## [1] -24.80020   5.15338

Let’s make differential abundance plots for our combined dataset-first let’s look at diff. abundance of taxa by inoculum source

## [1] 7.212696e-51 1.000000e+00
## [1] -42.16813  43.59079

Let’s make differential abundance plots for our combined dataset-first let’s look at diff. abundance of taxa by whether pots were in trial 1 or 2

## [1] 1.807349e-190  9.952921e-01
## [1] -26.94927  25.83220

Let’s try to facet these plots.

## quartz_off_screen 
##                 2

Let’s finally make an exploratory heat tree to look at community composition across all samples, just retaining ASVs present at least 10 times

Overall community plot

obj_subset <- obj2 %>%
  filter_taxa(taxon_ranks == "Species", supertaxa = TRUE, reassign_obs = c(tax_abund = FALSE)) 
min_bootstrap <- 60

obj_subset$data$score$boot <- as.numeric(obj_subset$data$score$boot)
max_boot <- obj_subset$data$score %>%
  group_by(taxon_id) %>%
  summarise(max = max(boot))
max_boot <- setNames(max_boot$max, max_boot$taxon_id)
obj_subset <- filter_taxa(obj_subset, max_boot[taxon_ids] >= min_bootstrap | taxon_ranks %in% c("ASV", "Reference"), reassign_obs = c(abund = TRUE, score = FALSE))

obj_subset$data$asv_prop <- calc_obs_props(obj_subset, 'abund', cols = metadata$sample_name)
## Calculating proportions from counts for 162 columns for 2707 observations.
obj_subset$data$tax_abund <- calc_taxon_abund(obj_subset, 'abund', cols = metadata$sample_name)
## Summing per-taxon counts from 162 columns for 1130 taxa
obj_subset$data$tax_prop <- calc_taxon_abund(obj_subset, 'asv_prop', cols = metadata$sample_name)
## Summing per-taxon counts from 162 columns for 1130 taxa
obj_subset$data$tax_data <- calc_n_samples(obj_subset, 'tax_prop', cols = metadata$sample_name[metadata$sample_type == 'Sample'])
## Calculating number of samples with a value greater than 0 for 162 columns for 1130 observations
obj_subset$data$tax_data$mean_prop <- rowMeans(obj_subset$data$tax_prop[, metadata$sample_name])

# Calculate the relative standard deviation for each taxon as a measure of how consistently it was found.
rsd <- function(x, na.rm = FALSE) {sd(x, na.rm = na.rm) / mean(x, na.rm = na.rm)}
obj_subset$data$tax_data$rel_stand_dev <- map_dbl(1:nrow(obj_subset$data$tax_abund), function(i) {
  rsd(unlist(obj_subset$data$tax_abund[i, metadata$sample_name]), na.rm = TRUE)
})
obj_subset
## <Taxmap>
##   1130 taxa: aab. Eukaryota, aac. Fungi ... cin. Fungi_sp
##   1130 edges: NA->aab, aab->aac, aab->aad ... cil->cim, cim->cin
##   6 data sets:
##     abund:
##       # A tibble: 2,707 × 180
##         taxon_id sequence   dada2_tax is_inoculum inoculum    S1   S10
##         <chr>    <chr>      <chr>     <lgl>       <chr>    <int> <int>
##       1 acm      AAGTCGTAA… Eukaryot… FALSE       <NA>      4670  3565
##       2 bgb      AAGTCGTAA… Eukaryot… FALSE       <NA>       493   668
##       3 bgc      AAAAAGTCG… Eukaryot… FALSE       <NA>      7603     0
##       # ℹ 2,704 more rows
##       # ℹ 173 more variables: S100 <int>, S101 <int>, S102 <int>,
##       #   S103 <int>, S104 <int>, S105 <int>, S106 <int>, S107 <int>,
##       #   S108 <int>, S109 <int>, …
##     score:
##       # A tibble: 22,959 × 4
##         taxon_id sequence                                   boot rank 
##         <chr>    <chr>                                     <dbl> <chr>
##       1 aab      AAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGA…   100 Doma…
##       2 aac      AAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGA…   100 King…
##       3 aae      AAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGA…    73 Phyl…
##       # ℹ 22,956 more rows
##     tax_abund:
##       # A tibble: 1,130 × 163
##         taxon_id    S1   S10  S100  S101  S102  S103  S104  S105  S106
##         <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##       1 aab      56860 35128 50111 42655 85627 70347 52167 54109 51792
##       2 aac      27729 12225 11118 13060 18474 19097 10151 11127 12198
##       3 aad      29131 22903 38993 29595 67153 51250 42016 42982 39594
##       # ℹ 1,127 more rows
##       # ℹ 153 more variables: S107 <dbl>, S108 <dbl>, S109 <dbl>,
##       #   S11 <dbl>, S110 <dbl>, S111 <dbl>, S112 <dbl>, S113 <dbl>,
##       #   S114 <dbl>, S115 <dbl>, …
##     tax_prop:
##       # A tibble: 1,130 × 163
##         taxon_id    S1   S10  S100  S101  S102  S103  S104  S105  S106
##         <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##       1 aab      1     1     1     1     1     1     1     1     1    
##       2 aac      0.488 0.348 0.222 0.306 0.216 0.271 0.195 0.206 0.236
##       3 aad      0.512 0.652 0.778 0.694 0.784 0.729 0.805 0.794 0.764
##       # ℹ 1,127 more rows
##       # ℹ 153 more variables: S107 <dbl>, S108 <dbl>, S109 <dbl>,
##       #   S11 <dbl>, S110 <dbl>, S111 <dbl>, S112 <dbl>, S113 <dbl>,
##       #   S114 <dbl>, S115 <dbl>, …
##     asv_prop:
##       # A tibble: 2,707 × 163
##         taxon_id      S1    S10    S100    S101    S102    S103
##         <chr>      <dbl>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##       1 acm      0.0821  0.101  0.00142 0.0396  0.0345  0.00190
##       2 bgb      0.00867 0.0190 0.0154  0.00757 0.00565 0.0295 
##       3 bgc      0.134   0      0       0.0316  0       0      
##       # ℹ 2,704 more rows
##       # ℹ 156 more variables: S104 <dbl>, S105 <dbl>, S106 <dbl>,
##       #   S107 <dbl>, S108 <dbl>, S109 <dbl>, S11 <dbl>, S110 <dbl>,
##       #   S111 <dbl>, S112 <dbl>, …
##     tax_data:
##       # A tibble: 1,130 × 4
##         taxon_id n_samples mean_prop rel_stand_dev
##         <chr>        <int>     <dbl>         <dbl>
##       1 aab            162     1             0.443
##       2 aac            162     0.416         0.652
##       3 aad            162     0.584         0.711
##       # ℹ 1,127 more rows
##   0 functions:
set.seed(1000)
obj_subset %>%
  filter_taxa(! is_stem) %>%
  filter_taxa(n_samples >= 10, supertaxa = TRUE, reassign_obs = FALSE) %>%
  filter_taxa(! grepl(x = taxon_names, "_sp$"), reassign_obs = FALSE) %>%
  filter_taxa(! grepl(x = taxon_names, "incertae_sedis", ignore.case = TRUE), reassign_obs = FALSE) %>%
  remove_redundant_names() %>%
  heat_tree(node_size = mean_prop,
            edge_size = n_samples,
            node_color = ifelse(is.na(rel_stand_dev), 0, rel_stand_dev),
            #node_color_range = c("#aaaaaa", "#8da0cb", "#66c2a5", "#a6d854", "#fc8d62", "red"),
            node_label = taxon_names,
            node_size_range = c(0.008, 0.025),
            node_label_size_range = c(0.012, 0.018),
            edge_label_size_range = c(0.010, 0.013),
            node_size_interval = c(0, 1),
            edge_size_range = c(0.001, 0.008),
            #layout = "da", initial_layout = "re",
            node_color_axis_label = "Relative standard deviation",
            node_size_axis_label = "Mean proportion of reads",
            edge_size_axis_label = "Number of samples",
            node_color_digits = 2,
            node_size_digits = 2,
            edge_color_digits = 2,
            edge_size_digits = 2,
            #aspect_ratio = 1.618,
            output_file = file.path('demulticoder/figures', '/heattree_mostabund_taxa.pdf'))

sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value
##  version  R version 4.3.2 (2023-10-31)
##  os       macOS Ventura 13.2.1
##  system   aarch64, darwin20
##  ui       X11
##  language (EN)
##  collate  en_US.UTF-8
##  ctype    en_US.UTF-8
##  tz       America/Los_Angeles
##  date     2024-11-22
##  pandoc   3.1.1 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package              * version    date (UTC) lib source
##  abind                  1.4-5      2016-07-21 [1] CRAN (R 4.3.0)
##  ade4                   1.7-22     2023-02-06 [1] CRAN (R 4.3.0)
##  agricolae              1.3-7      2023-10-22 [1] CRAN (R 4.3.1)
##  AlgDesign              1.2.1      2022-05-25 [1] CRAN (R 4.3.0)
##  ape                    5.8        2024-04-11 [1] CRAN (R 4.3.1)
##  askpass                1.2.0      2023-09-03 [1] CRAN (R 4.3.0)
##  backports              1.5.0      2024-05-23 [1] CRAN (R 4.3.3)
##  Biobase                2.62.0     2023-10-26 [1] Bioconductor
##  BiocGenerics         * 0.48.1     2023-11-02 [1] Bioconductor
##  BiocParallel           1.36.0     2023-10-26 [1] Bioconductor
##  biomformat             1.30.0     2023-10-26 [1] Bioconductor
##  Biostrings           * 2.70.3     2024-04-03 [1] bioc_xgit (@c213e35)
##  bit                    4.5.0      2024-09-20 [1] CRAN (R 4.3.3)
##  bit64                  4.5.2      2024-09-22 [1] CRAN (R 4.3.3)
##  bitops                 1.0-8      2024-07-29 [1] CRAN (R 4.3.3)
##  broom                  1.0.6      2024-05-17 [1] CRAN (R 4.3.3)
##  bslib                  0.8.0      2024-07-29 [1] CRAN (R 4.3.3)
##  cachem                 1.1.0      2024-05-16 [1] CRAN (R 4.3.2)
##  car                    3.1-2      2023-03-30 [1] CRAN (R 4.3.0)
##  carData                3.0-5      2022-01-06 [1] CRAN (R 4.3.0)
##  cli                    3.6.3      2024-06-21 [1] CRAN (R 4.3.2)
##  cluster                2.1.6      2023-12-01 [1] CRAN (R 4.3.1)
##  codetools              0.2-20     2024-03-31 [1] CRAN (R 4.3.1)
##  colorspace             2.1-1      2024-07-26 [1] CRAN (R 4.3.3)
##  cowplot                1.1.3      2024-01-22 [1] CRAN (R 4.3.1)
##  crayon                 1.5.3      2024-06-20 [1] CRAN (R 4.3.3)
##  data.table           * 1.15.4     2024-03-30 [1] CRAN (R 4.3.1)
##  decontam             * 1.22.0     2023-10-26 [1] Bioconductor
##  DelayedArray           0.28.0     2023-11-06 [1] Bioconductor
##  DESeq2                 1.42.1     2024-03-09 [1] Bioconductor 3.18 (R 4.3.3)
##  digest                 0.6.37     2024-08-19 [1] CRAN (R 4.3.3)
##  dplyr                * 1.1.4      2023-11-17 [1] CRAN (R 4.3.1)
##  evaluate               1.0.1      2024-10-10 [1] CRAN (R 4.3.3)
##  fansi                  1.0.6      2023-12-08 [1] CRAN (R 4.3.1)
##  farver                 2.1.2      2024-05-13 [1] CRAN (R 4.3.3)
##  fastmap                1.2.0      2024-05-15 [1] CRAN (R 4.3.2)
##  forcats              * 1.0.0      2023-01-29 [1] CRAN (R 4.3.0)
##  foreach                1.5.2      2022-02-02 [1] CRAN (R 4.3.0)
##  furrr                * 0.3.1      2022-08-15 [1] CRAN (R 4.3.0)
##  future               * 1.34.0     2024-07-29 [1] CRAN (R 4.3.3)
##  generics               0.1.3      2022-07-05 [1] CRAN (R 4.3.0)
##  GenomeInfoDb         * 1.38.8     2024-03-16 [1] Bioconductor 3.18 (R 4.3.3)
##  GenomeInfoDbData       1.2.11     2023-11-14 [1] Bioconductor
##  GenomicRanges          1.54.1     2023-10-30 [1] Bioconductor
##  ggfittext              0.10.2     2024-02-01 [1] CRAN (R 4.3.1)
##  ggplot2              * 3.5.1      2024-04-23 [1] CRAN (R 4.3.1)
##  ggpubr                 0.6.0      2023-02-10 [1] CRAN (R 4.3.0)
##  ggsignif               0.6.4      2022-10-13 [1] CRAN (R 4.3.0)
##  globals                0.16.3     2024-03-08 [1] CRAN (R 4.3.1)
##  glue                   1.8.0      2024-09-30 [1] CRAN (R 4.3.3)
##  gridExtra            * 2.3        2017-09-09 [1] CRAN (R 4.3.0)
##  gtable                 0.3.5      2024-04-22 [1] CRAN (R 4.3.1)
##  highr                  0.11       2024-05-26 [1] CRAN (R 4.3.3)
##  hms                    1.1.3      2023-03-21 [1] CRAN (R 4.3.0)
##  htmltools              0.5.8.1    2024-04-04 [1] CRAN (R 4.3.1)
##  igraph                 2.0.3      2024-03-13 [1] CRAN (R 4.3.1)
##  IRanges              * 2.36.0     2023-10-26 [1] Bioconductor
##  iterators              1.0.14     2022-02-05 [1] CRAN (R 4.3.0)
##  jquerylib              0.1.4      2021-04-26 [1] CRAN (R 4.3.0)
##  jsonlite               1.8.8      2023-12-04 [1] CRAN (R 4.3.1)
##  knitr                  1.48       2024-07-07 [1] CRAN (R 4.3.3)
##  labeling               0.4.3      2023-08-29 [1] CRAN (R 4.3.0)
##  lattice              * 0.22-6     2024-03-20 [1] CRAN (R 4.3.1)
##  lazyeval               0.2.2      2019-03-15 [1] CRAN (R 4.3.0)
##  lifecycle              1.0.4      2023-11-07 [1] CRAN (R 4.3.1)
##  listenv                0.9.1      2024-01-29 [1] CRAN (R 4.3.1)
##  locfit                 1.5-9.10   2024-06-24 [1] CRAN (R 4.3.3)
##  magick               * 2.8.4      2024-07-14 [1] CRAN (R 4.3.3)
##  magrittr               2.0.3      2022-03-30 [1] CRAN (R 4.3.0)
##  MASS                   7.3-60.0.1 2024-01-13 [1] CRAN (R 4.3.1)
##  Matrix                 1.6-5      2024-01-11 [1] CRAN (R 4.3.1)
##  MatrixGenerics         1.14.0     2023-10-26 [1] Bioconductor
##  matrixStats            1.3.0      2024-04-11 [1] CRAN (R 4.3.1)
##  metacoder            * 0.3.7      2024-02-20 [1] CRAN (R 4.3.1)
##  mgcv                   1.9-1      2023-12-21 [1] CRAN (R 4.3.1)
##  multtest               2.58.0     2023-10-26 [1] Bioconductor
##  munsell                0.5.1      2024-04-01 [1] CRAN (R 4.3.1)
##  nlme                   3.1-165    2024-06-06 [1] CRAN (R 4.3.3)
##  parallelly             1.38.0     2024-07-27 [1] CRAN (R 4.3.3)
##  pdftools             * 3.4.1      2024-09-20 [1] CRAN (R 4.3.3)
##  permute              * 0.9-7      2022-01-27 [1] CRAN (R 4.3.0)
##  phyloseq             * 1.46.0     2024-04-03 [1] bioc_xgit (@7320133)
##  pillar                 1.9.0      2023-03-22 [1] CRAN (R 4.3.0)
##  pkgconfig              2.0.3      2019-09-22 [1] CRAN (R 4.3.0)
##  plyr                   1.8.9      2023-10-02 [1] CRAN (R 4.3.1)
##  purrr                * 1.0.2      2023-08-10 [1] CRAN (R 4.3.0)
##  qpdf                   1.3.4      2024-10-04 [1] CRAN (R 4.3.3)
##  R6                     2.5.1      2021-08-19 [1] CRAN (R 4.3.0)
##  ragg                   1.3.2      2024-05-15 [1] CRAN (R 4.3.3)
##  Rcpp                   1.0.13     2024-07-17 [1] CRAN (R 4.3.2)
##  RCurl                  1.98-1.16  2024-07-11 [1] CRAN (R 4.3.3)
##  readr                * 2.1.5      2024-01-10 [1] CRAN (R 4.3.1)
##  reshape2               1.4.4      2020-04-09 [1] CRAN (R 4.3.0)
##  rhdf5                  2.46.1     2023-12-02 [1] Bioconductor 3.18 (R 4.3.2)
##  rhdf5filters           1.14.1     2023-11-06 [1] Bioconductor
##  Rhdf5lib               1.24.2     2024-02-10 [1] Bioconductor 3.18 (R 4.3.2)
##  rlang                  1.1.4      2024-06-04 [1] CRAN (R 4.3.2)
##  rmarkdown              2.27       2024-05-17 [1] CRAN (R 4.3.3)
##  rstatix                0.7.2      2023-02-01 [1] CRAN (R 4.3.0)
##  rstudioapi             0.16.0     2024-03-24 [1] CRAN (R 4.3.1)
##  S4Arrays               1.2.1      2024-03-05 [1] Bioconductor 3.18 (R 4.3.2)
##  S4Vectors            * 0.40.2     2023-11-25 [1] Bioconductor 3.18 (R 4.3.2)
##  sass                   0.4.9      2024-03-15 [1] CRAN (R 4.3.1)
##  scales                 1.3.0      2023-11-28 [1] CRAN (R 4.3.1)
##  sessioninfo            1.2.2      2021-12-06 [1] CRAN (R 4.3.0)
##  SparseArray            1.2.4      2024-02-10 [1] Bioconductor 3.18 (R 4.3.2)
##  stringi                1.8.4      2024-05-06 [1] CRAN (R 4.3.2)
##  stringr              * 1.5.1      2023-11-14 [1] CRAN (R 4.3.1)
##  SummarizedExperiment   1.32.0     2023-11-06 [1] Bioconductor
##  survival               3.7-0      2024-06-05 [1] CRAN (R 4.3.3)
##  svglite                2.1.3      2023-12-08 [1] CRAN (R 4.3.1)
##  systemfonts            1.1.0      2024-05-15 [1] CRAN (R 4.3.3)
##  textshaping            0.4.0      2024-05-24 [1] CRAN (R 4.3.3)
##  tibble                 3.2.1      2023-03-20 [1] CRAN (R 4.3.0)
##  tidyr                * 1.3.1      2024-01-24 [1] CRAN (R 4.3.1)
##  tidyselect             1.2.1      2024-03-11 [1] CRAN (R 4.3.1)
##  tzdb                   0.4.0      2023-05-12 [1] CRAN (R 4.3.0)
##  utf8                   1.2.4      2023-10-22 [1] CRAN (R 4.3.1)
##  vctrs                  0.6.5      2023-12-01 [1] CRAN (R 4.3.1)
##  vegan                * 2.6-6.1    2024-05-21 [1] CRAN (R 4.3.3)
##  viridis                0.6.5      2024-01-29 [1] CRAN (R 4.3.1)
##  viridisLite            0.4.2      2023-05-02 [1] CRAN (R 4.3.0)
##  vroom                  1.6.5      2023-12-05 [1] CRAN (R 4.3.1)
##  withr                  3.0.1      2024-07-31 [1] CRAN (R 4.3.2)
##  xfun                   0.48       2024-10-03 [1] CRAN (R 4.3.3)
##  XVector              * 0.42.0     2023-10-26 [1] Bioconductor
##  yaml                   2.3.10     2024-07-26 [1] CRAN (R 4.3.3)
##  zlibbioc               1.48.2     2024-03-19 [1] Bioconductor 3.18 (R 4.3.3)
## 
##  [1] /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library
## 
## ──────────────────────────────────────────────────────────────────────────────